Upload the dataframes and alignment (in FASTA) to calculate the pairwise genetic distance matrix:
library("ape")
library("seqinr")
##
## Attaching package: 'seqinr'
## The following objects are masked from 'package:ape':
##
## as.alignment, consensus
library("FactoMineR")
library("factoextra")
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library("corrplot")
## corrplot 0.92 loaded
RVAdistdf <- read.csv("RVAdistdf.csv", row.names=1) #the pairwise genetic distance dataframe
RVAmetadf <-read.csv("RVA-PCA_metadata.csv", row.names=1) #the metadata associated to sequences/individuals
RVAmetadf[RVAmetadf ==''] <- NA
RVAmetadf$AgeGroup <- as.factor(RVAmetadf$AgeGroup)
RVAmetadf$Genotype <- as.factor(RVAmetadf$Genotype)
RVAmetadf$ZIPCODE <- as.factor(RVAmetadf$ZIPCODE)
RVAmetadf$sex <- as.factor(RVAmetadf$sex)
RVAmetadf$COLLECTION <- as.factor(RVAmetadf$COLL_MONTH)
RVAmetadf$Conclusion <- as.factor(RVAmetadf$Conclusion)
RVAmetadf$age <- as.numeric(RVAmetadf$age)
Calculate the eigenvalues and principal component analysis:
#calculating the PCA
res.RVA <- PCA(RVAdistdf, scale.unit = TRUE, ncp = 4)
eig.val <- get_eigenvalue(res.RVA)
#To help determining the number of Principal Components we can see the Scree Plot, which is the plot of the eigenvalues ordered from largest to the smallest.
fviz_eig(res.RVA, addlabels = TRUE, ylim = c(0, 50))
Analysis of quality and contribution of individuals:
fviz_pca_ind(res.RVA, col.ind = "cos2", pointsize = "cos2", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), labelsize=2)
#To visualize the contribution of individuals to the first two principal components:
fviz_contrib(res.RVA, choice = "ind", axes = 1:2, labelsize=2) + theme(text = element_text(size=7))
Evaluating the PCA by clustering based on metadata information:
#Individuals by Age Group
fviz_pca_ind(res.RVA, geom.ind = "point", col.ind = RVAmetadf$AgeGroup, palette = "lancet", addEllipses = TRUE, legend.title = "Age Group")
#Individuals by Symptoms
fviz_pca_ind(res.RVA, geom.ind = "point", col.ind = RVAmetadf$Conclusion, palette = "lancet", addEllipses = TRUE, legend.title = "Symptoms")
## Warning: Removed 19 rows containing missing values (`geom_point()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
#Individuals by month and year of collection
fviz_pca_ind(res.RVA, geom.ind = "point", col.ind = RVAmetadf$COLLECTION, palette = "lancet", addEllipses = TRUE, legend.title = "Collection Month")
## Too few points to calculate an ellipse
#Individuals by Zip Code
fviz_pca_ind(res.RVA, geom.ind = "point", col.ind = RVAmetadf$ZIPCODE, addEllipses = TRUE, legend.title = "Zip Code")
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
## Too few points to calculate an ellipse
#Individuals by Gender
fviz_pca_ind(res.RVA, geom.ind = "point", col.ind = RVAmetadf$sex, palette = "lancet", addEllipses = TRUE, legend.title = "Sex")
Constructing a dendrogram and making the hierarchical clustering:
library("dendextend")
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following objects are masked from 'package:ape':
##
## ladderize, rotate
## The following object is masked from 'package:stats':
##
## cutree
library("svglite")
RVAdm<-dist(RVAdistdf, method = 'euclidean')
RVAhc<-hclust(RVAdm, method="complete") # simple dendrogram
#evaluating the number of clusters con average silhouette:
fviz_nbclust(RVAdistdf, FUNcluster=hcut, method="silhouette", k.max = 10)
fviz_nbclust(RVAdistdf, FUNcluster=hcut, method="wss")
cut_cmp <- cutree(RVAhc, k = 4) #applying the clustering by wss
plot(RVAhc, hang=-1, cex=0.2)
#plot with cluster rectangles within the clusters:
rect.hclust(RVAhc, k=4, border=2:4)
#save plot of dendrogram:
#svg(filename = "RVA_HC_PCA_euclidean.svg")
#plot(RVAhc, hang=-1, cex=0.2)
#rect.hclust(RVAhc, k=4, border=2:4) #if I want to see the clusters identified by a rectangle
#dev.off()
Evaluate the association of metadata with the clusters by hierarchical clustering:
#make a dataframe with the clusters and evaluating association with metadata using G test of independence between groups:
library("dplyr")
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:seqinr':
##
## count
## The following object is masked from 'package:ape':
##
## where
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("rcompanion")
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::count() masks seqinr::count()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::where() masks ape::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("ggpubr")
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:dendextend':
##
## rotate
##
## The following object is masked from 'package:ape':
##
## rotate
library("rstatix")
##
## Attaching package: 'rstatix'
##
## The following object is masked from 'package:stats':
##
## filter
library("AMR")
RVA_cluster <- merge(RVAmetadf, cut_cmp, by = 0)
RVA_cluster$y<-as.factor(RVA_cluster$y)
g.test(RVA_cluster$Conclusion, RVA_cluster$y)
##
## G-test of independence
##
## data: RVA_cluster$Conclusion and RVA_cluster$y
## X-squared = 1.1482, p-value = 0.7654
X1 = pairwiseNominalIndependence(as.matrix(table(RVA_cluster$Conclusion, RVA_cluster$y)), compare = "column", fisher = FALSE, chisq = FALSE, method = "bonferroni", digits = 3)
cldList(p.adj.Gtest ~ Comparison, data= X1, threshold = 0.05)
## Group Letter MonoLetter
## 1 1 a a
## 2 2 a a
## 3 3 a a
## 4 4 a a
ggplot(RVA_cluster, aes(x=Conclusion, fill=y)) + geom_bar(position = "fill")
g.test(RVA_cluster$ZIPCODE, RVA_cluster$y)
## Warning in g.test(RVA_cluster$ZIPCODE, RVA_cluster$y): G-statistic
## approximation may be incorrect due to E < 5
##
## G-test of independence
##
## data: RVA_cluster$ZIPCODE and RVA_cluster$y
## X-squared = 61.369, p-value = 0.05259
X2 = pairwiseNominalIndependence(as.matrix(table(RVA_cluster$ZIPCODE, RVA_cluster$y)), compare = "column", fisher = FALSE, chisq = FALSE, method = "bonferroni", digits = 3)
cldList(p.adj.Gtest ~ Comparison, data= X2, threshold = 0.05)
## Group Letter MonoLetter
## 1 1 a a
## 2 2 a a
## 3 3 a a
## 4 4 a a
ggplot(RVA_cluster, aes(x=ZIPCODE, fill=y)) + geom_bar(position = "fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
X3 = pairwiseNominalIndependence(as.matrix(table(RVA_cluster$sex, RVA_cluster$y)), compare = "column", fisher = FALSE, chisq = FALSE, method = "bonferroni", digits = 3)
cldList(p.adj.Gtest ~ Comparison, data= X3, threshold = 0.05)
## Group Letter MonoLetter
## 1 1 a a
## 2 2 a a
## 3 3 a a
## 4 4 a a
ggplot(RVA_cluster, aes(x=sex, fill=y)) + geom_bar(position = "fill")
X4 = pairwiseNominalIndependence(as.matrix(table(RVA_cluster$AgeGroup, RVA_cluster$y)), compare = "column", fisher = FALSE, chisq = FALSE, method = "bonferroni", digits = 3)
cldList(p.adj.Gtest ~ Comparison, data= X4, threshold = 0.05)
## Group Letter MonoLetter
## 1 1 ab ab
## 2 2 a a
## 3 3 b b
## 4 4 a a
ggplot(RVA_cluster, aes(x=AgeGroup, fill=y)) + geom_bar(position = "fill") + scale_x_discrete(limits = c("less5", "5-17", "18-64", "more65"))
#Further analysis with age as continuous variable:
res.kruskal <- RVA_cluster %>% kruskal_test(age ~ y) #Kruskal-Wallis Test
res.kruskal
## # A tibble: 1 × 6
## .y. n statistic df p method
## * <chr> <int> <dbl> <int> <dbl> <chr>
## 1 age 258 3.35 3 0.341 Kruskal-Wallis
#Wilcoxon’s test to calculate pairwise comparisons between group:
pwc <- RVA_cluster %>%
wilcox_test(age ~ y, p.adjust.method = "bonferroni")
pwc
## # A tibble: 6 × 9
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 age 1 2 35 106 1560. 0.159 0.954 ns
## 2 age 1 3 35 38 666 0.996 1 ns
## 3 age 1 4 35 79 1209 0.288 1 ns
## 4 age 2 3 106 38 2312 0.177 1 ns
## 5 age 2 4 106 79 4372 0.608 1 ns
## 6 age 3 4 38 79 1312 0.272 1 ns
#visualization: box plots with p-values:
pwc <- pwc %>% add_xy_position(x = "y")
ggboxplot(RVA_cluster, x = "y", y = "age") +
stat_pvalue_manual(pwc, hide.ns = TRUE) +
labs(
subtitle = get_test_label(res.kruskal, detailed = TRUE),
caption = get_pwc_label(pwc)
)